In [19]:
import pandas as pd
data = pd.read_csv("TravelInsurancePrediction.csv")
data.head()
Out[19]:
Unnamed: 0 Age Employment Type GraduateOrNot AnnualIncome FamilyMembers ChronicDiseases FrequentFlyer EverTravelledAbroad TravelInsurance
0 0 31 Government Sector Yes 400000 6 1 No No 0
1 1 31 Private Sector/Self Employed Yes 1250000 7 0 No No 0
2 2 34 Private Sector/Self Employed Yes 500000 4 1 No No 1
3 3 28 Private Sector/Self Employed Yes 700000 3 1 No No 0
4 4 28 Private Sector/Self Employed Yes 700000 8 1 Yes No 0
In [20]:
data.drop(columns=["Unnamed: 0"], inplace=True)
In [21]:
data.isnull().sum()
Out[21]:
Age                    0
Employment Type        0
GraduateOrNot          0
AnnualIncome           0
FamilyMembers          0
ChronicDiseases        0
FrequentFlyer          0
EverTravelledAbroad    0
TravelInsurance        0
dtype: int64
In [22]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987 entries, 0 to 1986
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Age                  1987 non-null   int64 
 1   Employment Type      1987 non-null   object
 2   GraduateOrNot        1987 non-null   object
 3   AnnualIncome         1987 non-null   int64 
 4   FamilyMembers        1987 non-null   int64 
 5   ChronicDiseases      1987 non-null   int64 
 6   FrequentFlyer        1987 non-null   object
 7   EverTravelledAbroad  1987 non-null   object
 8   TravelInsurance      1987 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 139.8+ KB
In [23]:
data["TravelInsurance"] = data["TravelInsurance"].map({0: "Not Purchased", 1: "Purchased"})
In [55]:
import plotly.express as px
figure = px.histogram(data, x = "Employment Type", 
                      color = "TravelInsurance", 
                      title= "Factors Affecting Purchase of Travel Insurance: Employment Type")
figure.show()
In [56]:
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook"
data = data
figure = px.histogram(data, x = "Age", 
                      color = "TravelInsurance", 
                      title= "Factors Affecting Purchase of Travel Insurance: Age")
figure.show()
In [54]:
import plotly.express as px
data = data
figure = px.histogram(data, x = "AnnualIncome", 
                      color = "TravelInsurance", 
                      title= "Factors Affecting Purchase of Travel Insurance: Income")
figure.show()
In [27]:
import numpy as np
data["GraduateOrNot"] = data["GraduateOrNot"].map({"No": 0, "Yes": 1})
data["FrequentFlyer"] = data["FrequentFlyer"].map({"No": 0, "Yes": 1})
data["EverTravelledAbroad"] = data["EverTravelledAbroad"].map({"No": 0, "Yes": 1})
x = np.array(data[["Age", "GraduateOrNot", 
                   "AnnualIncome", "FamilyMembers", 
                   "ChronicDiseases", "FrequentFlyer", 
                   "EverTravelledAbroad"]])
y = np.array(data[["TravelInsurance"]])
In [28]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.1, random_state=42)
model = DecisionTreeClassifier()
model.fit(xtrain, ytrain)
predictions = model.predict(xtest)
In [29]:
from sklearn.metrics import accuracy_score
print(accuracy_score(ytest,predictions))
0.8090452261306532
In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz
In [31]:
rf = RandomForestClassifier(random_state=42, n_jobs=-1, max_depth=6, n_estimators=100, oob_score=True)
rf.fit(xtrain, ytrain)
C:\Users\Nicholas Bagwandeen\AppData\Local\Temp\ipykernel_35044\3342771140.py:2: DataConversionWarning:

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().

Out[31]:
RandomForestClassifier(max_depth=6, n_jobs=-1, oob_score=True, random_state=42)
In [32]:
rf.oob_score_
Out[32]:
0.8316554809843401
In [33]:
y_pred = rf.predict(xtest)
print(accuracy_score(ytest, y_pred))
0.8592964824120602
In [34]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint
In [35]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = [1,2,3,4,5]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2,3,4, 5,6,7,8,9, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2,3, 4,5,6,7,8,9,10]
# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
               
pprint(random_grid)
{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': [1, 2, 3, 4, 5],
 'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],
 'n_estimators': [100, 311, 522, 733, 944, 1155, 1366, 1577, 1788, 2000]}
In [36]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 500, cv = 3, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
rf_random.fit(xtrain, ytrain)
Fitting 3 folds for each of 500 candidates, totalling 1500 fits
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:926: DataConversionWarning:

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().

Out[36]:
RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=500,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': [1, 2, 3, 4, 5],
                                        'min_samples_leaf': [1, 2, 3, 4, 5, 6,
                                                             7, 8, 9, 10],
                                        'min_samples_split': [2, 3, 4, 5, 6, 7,
                                                              8, 9, 10],
                                        'n_estimators': [100, 311, 522, 733,
                                                         944, 1155, 1366, 1577,
                                                         1788, 2000]},
                   random_state=42, verbose=2)
In [37]:
rf_random.best_params_
Out[37]:
{'n_estimators': 1366,
 'min_samples_split': 4,
 'min_samples_leaf': 2,
 'max_features': 2,
 'max_depth': 10,
 'bootstrap': True}
In [38]:
y_pred = rf_random.predict(xtest)
In [39]:
accuracy_score(ytest, y_pred)
Out[39]:
0.8542713567839196
In [40]:
from sklearn.model_selection import GridSearchCV# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [8, 9, 10, 11, 12],
    'max_features': [1 ,2, 3],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [2,3,4,5,6],
    'n_estimators': [1200,1250,1300,1350,1400]
}# Create a based model
rf = RandomForestClassifier()# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)
In [41]:
# Fit the grid search to the data
grid_search.fit(xtrain, ytrain)
grid_search.best_params_
Fitting 3 folds for each of 1875 candidates, totalling 5625 fits
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:926: DataConversionWarning:

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().

Out[41]:
{'bootstrap': True,
 'max_depth': 9,
 'max_features': 3,
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 1200}
In [43]:
rf = RandomForestClassifier(bootstrap=True, max_depth=9, max_features=3, min_samples_leaf=2, min_samples_split=4, n_estimators=1200)
rf.fit(xtrain,ytrain)
y_pred = rf.predict(xtest)
accuracy_score(ytest, y_pred)
C:\Users\Nicholas Bagwandeen\AppData\Local\Temp\ipykernel_35044\2290841162.py:2: DataConversionWarning:

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().

Out[43]:
0.8542713567839196
In [ ]:
 
In [ ]: